services:
  # LLM inference microservice. Runs nemollm_inference_ms against the host
  # directory mounted at /model-store and requests NVIDIA GPU devices.
  nemollm-inference:
    image: nvcr.io/ohlfw0olaadg/ea-participants/nim_llm:24.02
    container_name: nemollm-inference-microservice
    # Folded scalar: the lines below are joined with single spaces into one
    # command line. Only the OpenAI port (9999) is published further down;
    # the NeMo port (9998) is not published to the host.
    command: >-
      nemollm_inference_ms
      --model ${APP_LLM_MODELNAME:-mixtral-8x7b-instruct}
      --openai_port 9999
      --nemo_port 9998
      --num_gpus=${NUM_GPU:-1}
    volumes:
      # MODEL_DIRECTORY is required: Compose aborts with the message below
      # when the variable is unset or empty.
      - ${MODEL_DIRECTORY:?please update the env file and source it before running}:/model-store
    ports:
      - "9999:9999"
    expose:
      - "9999"
    shm_size: 20gb
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
              # The reservation size is read from INFERENCE_GPU_COUNT
              # (default: all), while --num_gpus above is read from NUM_GPU
              # (default: 1); the two variables are independent.
              count: ${INFERENCE_GPU_COUNT:-all}

  # Embedding microservice. Runs bin/web on port 9080 with the checkpoint
  # directory from the host mounted at /model-checkpoint-path.
  nemollm-embedding:
    container_name: nemo-retriever-embedding-microservice
    image: nvcr.io/ohlfw0olaadg/ea-participants/nemo-retriever-embedding-microservice:24.02
    volumes:
      # Required: Compose aborts with the message below when the variable is
      # unset or empty.
      - ${EMBEDDING_MODEL_DIRECTORY:?please update the env file and source it before running}:/model-checkpoint-path
    # EMBEDDING_MODEL_CKPT_NAME and EMBEDDING_MODEL_NAME are required as well.
    # Without the guard an unset variable expands to an empty string and the
    # service would be started with the broken paths "/model-checkpoint-path/"
    # and "model_config_templates/_template.yaml".
    # Folded scalar: the lines below are joined with single spaces.
    command: >-
      bin/web
      -p 9080
      -c /model-checkpoint-path/${EMBEDDING_MODEL_CKPT_NAME:?please update the env file and source it before running}
      -g model_config_templates/${EMBEDDING_MODEL_NAME:?please update the env file and source it before running}_template.yaml
    ports:
      - "9080:9080"
    expose:
      - "9080"
    shm_size: 8gb
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Reserve one specific GPU by ID (default "0") rather than a
              # device count; Compose treats device_ids and count as
              # mutually exclusive, so only one of them may be set.
              device_ids: ['${EMBEDDING_MS_GPU_ID:-0}']
              capabilities: [gpu]
    healthcheck:
      # Probes the readiness endpoint from inside the container.
      # NOTE(review): assumes curl is installed in the image - confirm.
      test: ["CMD", "curl", "-f", "http://localhost:9080/v1/health/ready"]
      interval: 30s
      timeout: 20s
      retries: 3
      # Failures during the first 10 minutes do not count towards retries.
      start_period: 10m

# Give the project's default network a fixed name instead of the
# project-derived one that Compose would generate.
# NOTE(review): presumably so that other Compose projects can attach to the
# same "nvidia-rag" network - confirm against the companion compose files.
networks:
  default:
    name: nvidia-rag
